knitr::opts_chunk$set(echo=TRUE, include=TRUE, message=TRUE, 
                      warning=FALSE, paged.print=FALSE, fig.width = 7)

##Important!! Enter credential information. Include last "/" of the baseurl
##ENTER BASE URL, USERNAME, PASSWORD
baseurl<-params$baseurl
username<-params$username

##Load required packages
required_packages<-c("tidyverse","httr","assertthat","readr","knitr",
                    "tidyr","jsonlite", "keyring", "lubridate","survival","survminer",
                    "GGally","ggfortify")


is_installed <- function(mypkg) is.element(mypkg, installed.packages()[,1])

load_or_install<-function(required_packages) {
  for(package_name in required_packages)  {
    if(!is_installed(package_name))     {
       install.packages(package_name)  }
    library(package_name,character.only=TRUE,quietly=TRUE,verbose=FALSE)
  }
}

load_or_install(required_packages)

From HIV Case Surveillance in DHIS2 to Kaplan-Meier Curves in R

The WHO HIV Case Based surveillance metadata package can be used to capture patient-level data on ART regimens over time, including date of ART initiation and patient status. However, DHIS2 analytics are not advanced enough to build Cox Proportional Hazards models, or survival curves like below. These can be done in R with the survminer and survival packages.

ggsurvplot(
    fit = survfit(Surv(time, status) ~ 1, data = lung), 
    xlab = "Days", 
    ylab = "Overall survival probability",
    title="Generic survival curve")

This demo will show how to download the DHIS2 Event Reports required to build survival curves in an RMarkdown report.

Enrollment Line Listing

Data have been collected from a demo DHIS2 database that uses the WHO metadata package for HIV case surveillance.

This event report shows the latest Visit stage for all enrollments from the past 5 years and present year.

The variables included are…

  • Gender : Tracked Entity Attribute for patient (options: Male, Female, Transgender)
  • Days since Treatment Start : Enrollment-type program indicator calculating days between ART initiation date and the latest Visit recorded (repeatable stage) for each patient
  • PLHIV with Viral Load Suppression : Data element for patient viral load <1000 copies/ml (suppressed) as of latest visit
  • Treatment Status : Data element for patient status as of latest visit (options: On ART, Death Reported, Transferred, Stopped Treatment, Lost to Follow Up)

Import Data

Two ways to import the data: download and then read CSV, or import directly through API.

To download the event report, go to Download > CSV

Data from the export could be then read into R for analysis with the readr::read_csv() function.

To import through DHIS2 API, the first thing to do is to enter credentials. On the first time running this script, you will be asked for a password to log-in to the demo system.

Once the keyring package stores and encrypts the password as a local environment variable, you can comment out that password set function Next time you run the script on this machine, you will not need to enter the password.

Using the API in this way allows for more process automation: run the script once, and later you can execute it routinely with a Cron job to access and publish the latest data.

#youll now be prompted for password
# if you hav already set the password with keyring, you can comment this out
# keyring::key_set("Password", username=username)

##test login
loginDHIS2<-function(baseurl,username,password) {
  url<-paste0(baseurl,"api/me")
  r<-GET(url,authenticate(username, 
                          keyring::key_get("Password", username=username)))
  warn_for_status(r, task="log in")
  if(r$status_code == 200L){return(TRUE)}
  }

if(loginDHIS2(baseurl, username, password)==TRUE){
  print("successfully logged in")
}else{
  stop("could not log in! Please check url, username and password")
}
## [1] "successfully logged in"

The API request url from the event report is found when going to Download > HTML . Simply change the link’s extension from .html+css to .csv

More details about the DHIS2 API can be found in the DHIS2 Developer Guide

read_csv() will show the columns imported and inferred column types

url<-paste0(baseurl,
            "api/29/analytics/enrollments/query/Xh88p1nyefp.csv?", #enrollment query in CSV
          "dimension=pe:THIS_YEAR;LAST_5_YEARS", #this year and last 5
          "&dimension=ou:USER_ORGUNIT",#user orgunit
          "&dimension=Jt68iauILtD", #Gender attribute
          "&dimension=ang4CLldbIu.S4IJiirQVHY", # VL test result
          "&dimension=ang4CLldbIu.JyeF0Utlhfp:GT:1&", #days since ART > 1 (follow up visit) 
          "&dimension=ang4CLldbIu.rcUsYgOnlyF",  # latest status
          "&stage=ang4CLldbIu&displayProperty=NAME", #identify stage and the displayName
          "&tableLayout=true&dataIdScheme=NAME", #use name of DE
          "&columns=pe;ou;Jt68iauILtD;JyeF0Utlhfp;S4IJiirQVHY;rcUsYgOnlyF", #table layout
          "&rows=pe;ou;Jt68iauILtD;JyeF0Utlhfp;S4IJiirQVHY;rcUsYgOnlyF&paging=false") 


# function to fetch data from API
dta<-read_csv(httr::content(GET(url)))
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Enrollment = col_character(),
##   `Tracked entity instance` = col_character(),
##   `Enrollment date` = col_datetime(format = ""),
##   `Incident date` = col_datetime(format = ""),
##   Geometry = col_logical(),
##   Longitude = col_double(),
##   Latitude = col_double(),
##   `Organisation unit name` = col_character(),
##   `Organisation unit code` = col_character(),
##   `Organisation unit` = col_character(),
##   `Gender M, F, TG` = col_character(),
##   `HIV - Viral load <1000` = col_double(),
##   `HIV - Days Since Treatment Start` = col_double(),
##   `HIV - Treatment Status` = col_character()
## )

Cleaning Data

When cleaning the data, observations are considered CENSORED if the latest visit status was On ART or Transferred Out. That means the patient did not die or stop treatment before the end of the observation period.

CS_data<-dta %>% 
  janitor::clean_names() %>% ## clean variable names
  select(contains("enrollment"), contains("gender"),
         contains("viral"), contains("treatment")) %>% ## select vars
  rename("gender"=3, "vl_suppressed"=4, "days_since_start"=5,"status"=6) %>%  
  filter(status!="NA") %>%  # remove NA#
  mutate(vl_suppressed=if_else(is.na(vl_suppressed),0,1)) %>%  # replace Na with 0
  mutate(censored=if_else(str_detect(status,c("ART|Transfer")), 1, 2)) 
# censor iff latest status On ART or transfer, otherwise status implies death/ltfu


#show first rows...
head(CS_data) %>% 
  kable()
enrollment enrollment_date gender vl_suppressed days_since_start status censored
sN4rsb88a5H 2020-02-12 Male 0 414 On ART 1
TdDo0b6IZg3 2020-10-25 Female 0 189 On ART 1
T7epRIGcG00 2021-03-11 Female 0 29 On ART 1
LH2ZsCJZxQv 2019-12-26 Female 0 497 On ART 1
bBKF1KYului 2019-08-25 Transgender 1 584 On ART 1
Pu1jPvNWMYy 2020-01-10 Transgender 1 455 On ART 1

Summaries of variables

##summarize data
CS_data %>% 
  select(enrollment_date,days_since_start) %>% 
  # split(.$status) %>% 
  map(summary)
## $enrollment_date
##                  Min.               1st Qu.                Median 
## "2018-12-02 00:00:00" "2019-05-23 00:00:00" "2019-11-06 00:00:00" 
##                  Mean               3rd Qu.                  Max. 
## "2019-11-20 03:47:41" "2020-04-03 00:00:00" "2021-04-23 00:00:00" 
## 
## $days_since_start
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.0   268.0   497.0   480.3   675.0  1027.0
##count by latest status
CS_data %>% 
  select(vl_suppressed, gender, status) %>% 
  map(summary.factor)
## $vl_suppressed
##    0    1 
## 2287 7326 
## 
## $gender
##      Female        Male Transgender 
##        5443        3772         398 
## 
## $status
##          Death (documented)           Lost to follow up 
##                         369                         335 
##                      On ART Refused (stopped) treatment 
##                        8159                         232 
##             Transferred out 
##                         518
## histogram
CS_data %>% 
  ggplot()+
  geom_histogram(aes(days_since_start))+
  facet_wrap(~status) + ## histogram by status
  labs(title="Histogram by Last Visit Status")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Specify Survival Function

Basic model without covariates, basic plot.

km_fit<-survfit(Surv(days_since_start, censored) ~ 1,
                 data=CS_data,
                 type="kaplan-meier")

km_fit
## Call: survfit(formula = Surv(days_since_start, censored) ~ 1, data = CS_data, 
##     type = "kaplan-meier")
## 
##       n  events  median 0.95LCL 0.95UCL 
##    9613     936    1027      NA      NA
plot(km_fit)

Add Covariates

Group by Gender

km_fit1<-survfit(Surv(days_since_start, censored) ~ gender,
                 data=CS_data,
                 type="kaplan-meier")

km_fit1
## Call: survfit(formula = Surv(days_since_start, censored) ~ gender, 
##     data = CS_data, type = "kaplan-meier")
## 
##                       n events median 0.95LCL 0.95UCL
## gender=Female      5443    502   1027      NA      NA
## gender=Male        3772    396   1027      NA      NA
## gender=Transgender  398     38     NA      NA      NA
# summary(km_fit1)

Group by VL Suppression

km_fit2<-survfit(Surv(days_since_start, censored) ~ vl_suppressed,
                 data=CS_data,
                 type="kaplan-meier")

km_fit2
## Call: survfit(formula = Surv(days_since_start, censored) ~ vl_suppressed, 
##     data = CS_data, type = "kaplan-meier")
## 
##                    n events median 0.95LCL 0.95UCL
## vl_suppressed=0 2287    350     NA      NA      NA
## vl_suppressed=1 7326    586   1027      NA      NA

Plot Survival Curve

the Kaplan-Meier model is plotted with the survminer R package.

Code adapted from: http://www.sthda.com/english/wiki/survminer-0-3-0

NB! These models are based on case surveillance dummy data and are for demonstration purposes only

By Gender

# fit <- survfit(Surv(time, status) ~ sex, data = lung)

plotObj<-ggsurvplot(km_fit1, data = CS_data,
           title = "HIV Survival on ART: By Gender",
           pval = TRUE, pval.method = TRUE,    # Add p-value &  method name
           legend.title = "VL<1000 copies/ml",  # Change legend titles
           palette = "lancet",                 # Use JCO journal color palette
           risk.table = TRUE,                  # Add No at risk table
           # cumevents = TRUE,                   # Add cumulative No of events table
           tables.height = 0.2,               # Specify tables height
           tables.theme = theme_cleantable(),  # Clean theme for tables
           tables.y.text = FALSE,              # Hide tables y axis text
           xlab = "Days Since Treatment Start",
           conf.int = TRUE,
           xlim = c(0,750),
           ylim = c(0.5,1)
)

# Now save the plot image
# ggsave("survivalplot.png", print(plotObj))

plotObj

By VL Suppression

# fit <- survfit(Surv(time, status) ~ sex, data = lung)

plotObj<-ggsurvplot(km_fit2, data = CS_data,
           title = "HIV Survival on ART",
           pval = TRUE, pval.method = TRUE,    # Add p-value &  method name
           legend.title = "",            # Change legend titles
           palette = "lancet",                 # Use JCO journal color palette
           risk.table = TRUE,                  # Add No at risk table
           # cumevents = TRUE,                   # Add cumulative No of events table
           tables.height = 0.2,               # Specify tables height
           tables.theme = theme_cleantable(),  # Clean theme for tables
           tables.y.text = FALSE,              # Hide tables y axis text
           xlab = "Days Since Treatment Start",
           conf.int = TRUE,
           xlim = c(0,750),
           ylim = c(0.5,1)
)

# Now save the plot image
ggsave("survivalplot.png", print(plotObj))
## Saving 7 x 5 in image
plotObj

More models

Comparing survival times between groups

survdiff(Surv(days_since_start, censored) ~ gender,
                 data=CS_data)
## Call:
## survdiff(formula = Surv(days_since_start, censored) ~ gender, 
##     data = CS_data)
## 
##                       N Observed Expected (O-E)^2/E (O-E)^2/V
## gender=Female      5443      502    521.4     0.721     1.763
## gender=Male        3772      396    371.5     1.621     2.915
## gender=Transgender  398       38     43.1     0.613     0.656
## 
##  Chisq= 3.2  on 2 degrees of freedom, p= 0.2

Cox regression model

broom::tidy(
  coxph(Surv(days_since_start, censored) ~ vl_suppressed, 
        data = CS_data), 
  exp = TRUE) %>% 
  kable()
term estimate std.error statistic p.value
vl_suppressed 0.101974 0.0713021 -32.0192 0

Next Steps

Other Analysis Workflows

  • Include other covariates, such as mode of transmission, age range, or CD4 count
  • Compare survival curves for annual cohorts or by geography (district, clinic, etc)
  • Use Knitr to turn RMarkdown doc into an HTML webpage, PDF, or slide deck
  • Run the script as a monthly CRON job to analyze latest data
  • Save survival curve as PNG file, share in report

Or… import CSV from DHIS2 into your choice of software e.g. STATA, SPSS

Other Resources for Survival Analyses


sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggfortify_0.4.11  GGally_2.1.1      survminer_0.4.9   ggpubr_0.4.0     
##  [5] survival_3.2-7    lubridate_1.7.9.2 keyring_1.1.0     jsonlite_1.7.2   
##  [9] knitr_1.31        assertthat_0.2.1  httr_1.4.2        forcats_0.5.1    
## [13] stringr_1.4.0     dplyr_1.0.3       purrr_0.3.4       readr_1.4.0      
## [17] tidyr_1.1.2       tibble_3.0.5      ggplot2_3.3.3     tidyverse_1.3.0  
## 
## loaded via a namespace (and not attached):
##  [1] ggtext_0.1.1       fs_1.5.0           RColorBrewer_1.1-2 ggsci_2.9         
##  [5] tools_4.0.2        backports_1.2.1    utf8_1.2.1         R6_2.5.0          
##  [9] DBI_1.1.1          colorspace_2.0-0   withr_2.4.2        tidyselect_1.1.1  
## [13] gridExtra_2.3      curl_4.3           compiler_4.0.2     cli_2.5.0         
## [17] rvest_0.3.6        xml2_1.3.2         labeling_0.4.2     scales_1.1.1      
## [21] survMisc_0.5.5     digest_0.6.27      foreign_0.8-81     rmarkdown_2.6     
## [25] rio_0.5.26         pkgconfig_2.0.3    htmltools_0.5.1.1  highr_0.9         
## [29] dbplyr_2.0.0       rlang_0.4.10       readxl_1.3.1       rstudioapi_0.13   
## [33] generics_0.1.0     farver_2.1.0       zoo_1.8-8          zip_2.1.1         
## [37] car_3.0-10         magrittr_2.0.1     Matrix_1.2-18      Rcpp_1.0.6        
## [41] munsell_0.5.0      fansi_0.4.2        abind_1.4-5        lifecycle_1.0.0   
## [45] stringi_1.5.3      yaml_2.2.1         snakecase_0.11.0   carData_3.0-4     
## [49] plyr_1.8.6         grid_4.0.2         crayon_1.4.1       lattice_0.20-41   
## [53] haven_2.3.1        splines_4.0.2      gridtext_0.1.4     hms_1.0.0         
## [57] pillar_1.6.0       markdown_1.1       ggsignif_0.6.1     reprex_1.0.0      
## [61] glue_1.4.2         evaluate_0.14      data.table_1.13.6  modelr_0.1.8      
## [65] vctrs_0.3.6        cellranger_1.1.0   gtable_0.3.0       reshape_0.8.8     
## [69] km.ci_0.5-2        xfun_0.20          openxlsx_4.2.3     janitor_2.1.0     
## [73] xtable_1.8-4       broom_0.7.6        rstatix_0.7.0      KMsurv_0.1-5      
## [77] ellipsis_0.3.1